R libraries.

library(ggplot2)
library(ggrepel)
library(ggpubr)
library(stringr)
library(MASS)
library(RColorBrewer)
library(DESeq2)

library(viridis)
library(ggpointdensity)
library(dplyr)
library(data.table)
library(ComplexHeatmap)
library(UpSetR)
library(readxl)
theme_set(theme_classic())

library(svglite)

dat_path = "../../MorPhiC/data/December-2023/JAX_RNAseq_ExtraEmbryonic/"
dat_path = paste0(dat_path, "processed/Tables")

Read in meta data

meta = fread("data/December_2023/JAX_RNAseq_ExtraEmbryonic_meta_data.tsv", 
             data.table = FALSE)
dim(meta)
## [1] 90 36
meta[1:2,]
##    clonal.label library_preparation.label             label
## 1   MOK20010-WT                GT23-10491   PrS-MOK20010-WT
## 2 MOK20010C-A06                GT23-10506 PrS-MOK20010C-A06
##                                                                            description
## 1                                                  KOLF2.2 derived primitive syncytium
## 2 KOLF2.2 deleted POU2F3 by deletion of critical exon (CE) derived primitive syncytium
##   differentiated_product_protocol_id model_system timepoint_value
## 1                           JAXPD001          PrS               6
## 2                           JAXPD001          PrS               6
##   timepoint_unit final_timepoint treatment_condition wt_control_status
## 1           days             Yes             hypoxia                WT
## 2           days             Yes      Not applicable                KO
##   ko_strategy ko_gene library_preparation.description
## 1          WT      WT                              NA
## 2          CE  POU2F3                              NA
##   library_preparation.library_preparation_protocol_id
## 1                                            JAXPL001
## 2                                            JAXPL001
##   library_preparation.average_fragment_size
## 1                                       431
## 2                                       428
##   library_preparation.input_amount_value library_preparation.input_amount_unit
## 1                                    300                                    ng
## 2                                    300                                    ng
##   library_preparation.concentration_value
## 1                                    47.7
## 2                                    47.7
##   library_preparation.concentration_unit library_preparation.total_yield_value
## 1                                     nM                                   244
## 2                                     nM                                   244
##   library_preparation.total_yield_unit library_preparation.cdna_pcr_cycles
## 1                                   ng                                  10
## 2                                   ng                                  10
##   library_preparation.pcr_cycles_for_indexing
## 1                               Not available
## 2                               Not available
##                                                file_id
## 1 KOLF2_POU2F3_1_GT23-10491_CCGTGAAG-ATCCACTG_S38_L001
## 2  POU2F3_CE_B05_GT23-10506_GCAGAATT-TGGCCGGT_S17_L001
##                        run_id
## 1 20230809_23-robson-005-run2
## 2 20230809_23-robson-005-run2
##                                         clonal.description clonal.parental_name
## 1                                                  KOLF2.2             KOLF2.2J
## 2 KOLF2.2 deleted POU2F3 by deletion of critical exon (CE)             KOLF2.2J
##   clonal.clone_id clonal.type clonal.zygosity
## 1              WT        iPSC  Not applicable
## 2             A06        iPSC  Not applicable
##   clonal.cell_line_generation_protocol clonal.treatment_condition
## 1                       Not applicable             Not applicable
## 2                       Not applicable             Not applicable
##   clonal.wt_control_status expression_alteration.label
## 1                       WT              Not applicable
## 2                       KO    JAX_POU2F3_Critical_exon
##                                 model_organ
## 1 extra-embryonic primitive syncytial cells
## 2 extra-embryonic primitive syncytial cells
names(meta)
##  [1] "clonal.label"                                       
##  [2] "library_preparation.label"                          
##  [3] "label"                                              
##  [4] "description"                                        
##  [5] "differentiated_product_protocol_id"                 
##  [6] "model_system"                                       
##  [7] "timepoint_value"                                    
##  [8] "timepoint_unit"                                     
##  [9] "final_timepoint"                                    
## [10] "treatment_condition"                                
## [11] "wt_control_status"                                  
## [12] "ko_strategy"                                        
## [13] "ko_gene"                                            
## [14] "library_preparation.description"                    
## [15] "library_preparation.library_preparation_protocol_id"
## [16] "library_preparation.average_fragment_size"          
## [17] "library_preparation.input_amount_value"             
## [18] "library_preparation.input_amount_unit"              
## [19] "library_preparation.concentration_value"            
## [20] "library_preparation.concentration_unit"             
## [21] "library_preparation.total_yield_value"              
## [22] "library_preparation.total_yield_unit"               
## [23] "library_preparation.cdna_pcr_cycles"                
## [24] "library_preparation.pcr_cycles_for_indexing"        
## [25] "file_id"                                            
## [26] "run_id"                                             
## [27] "clonal.description"                                 
## [28] "clonal.parental_name"                               
## [29] "clonal.clone_id"                                    
## [30] "clonal.type"                                        
## [31] "clonal.zygosity"                                    
## [32] "clonal.cell_line_generation_protocol"               
## [33] "clonal.treatment_condition"                         
## [34] "clonal.wt_control_status"                           
## [35] "expression_alteration.label"                        
## [36] "model_organ"
table(table(meta$clonal.label))
## 
##  1  2 
## 66 12
table(table(meta$library_preparation.label))
## 
##  1 
## 90
table(table(meta$label))
## 
##  1 
## 90

Read in gene count data and filter genes

Manually correcting the gene strings in certain column names.

As of the time of running this code, all differences below are due to partial correction of the GHRL gene name.

It should be “GRHL1” instead of “GHRL1”.

It is fixed in the file_id column in meta table, but not fixed in the count data file column names yet.

cts = fread(file.path(dat_path, "genesCounts.csv"), data.table = FALSE)
dim(cts)
## [1] 38592    91
cts[1:2, c(1:2, (ncol(cts)-1):ncol(cts))]
##              Name POU2F3_KO_F04_GT23-10504_GGTACCTT-GACGTCTT_S33_L001
## 1 ENSG00000268674                                                   0
## 2 ENSG00000271254                                                1030
##   PTC_A09__Hypoxia_GT23-11309_AACGTTCC-AGTACTCC_S30_L001
## 1                                                      0
## 2                                                    489
##   PTC_D10__Hypoxia_GT23-11310_GCAGAATT-TGGCCGGT_S22_L001
## 1                                                      0
## 2                                                    603
setdiff(names(cts)[-1], meta$file_id)
##  [1] "GHRL1_CE_A05_GT23-10497_TTGGACTC-CTGCTTCC_S12_L001" 
##  [2] "KOLF2_GHRL1_2_GT23-10492_TTACAGGA-GCTTGTCA_S6_L001" 
##  [3] "GHRL1_PTC_A09_GT23-10500_TAATACAG-GTGAATAT_S2_L001" 
##  [4] "GHRL1_PTC_C09_GT23-10502_ATGTAAGT-CATAGAGT_S19_L001"
##  [5] "GHRL1_PTC_A10_GT23-10501_CGGCGTGA-ACAGGCGC_S30_L001"
##  [6] "GHRL1_CE_A08_GT23-10498_CAGTAGGC-ATTCGTCA_S39_L001" 
##  [7] "GHRL1_KO_D02_GT23-10494_TCTCTACT-GAACCGCG_S26_L001" 
##  [8] "GHRL1_KO_H04_GT23-10496_CCAAGTCT-TCATCCTT_S35_L001" 
##  [9] "GHRL1_CE_D07_GT23-10499_TGACGAAT-GCCTACTG_S31_L001" 
## [10] "GHRL1_KO_G04_GT23-10495_CTCTCGTC-AGGTTATA_S15_L001"
setdiff(meta$file_id, names(cts)[-1])
##  [1] "KOLF2_GRHL1_2_GT23-10492_TTACAGGA-GCTTGTCA_S6_L001" 
##  [2] "GRHL1_CE_A05_GT23-10497_TTGGACTC-CTGCTTCC_S12_L001" 
##  [3] "GRHL1_CE_A08_GT23-10498_CAGTAGGC-ATTCGTCA_S39_L001" 
##  [4] "GRHL1_CE_D07_GT23-10499_TGACGAAT-GCCTACTG_S31_L001" 
##  [5] "GRHL1_PTC_A09_GT23-10500_TAATACAG-GTGAATAT_S2_L001" 
##  [6] "GRHL1_PTC_A10_GT23-10501_CGGCGTGA-ACAGGCGC_S30_L001"
##  [7] "GRHL1_PTC_C09_GT23-10502_ATGTAAGT-CATAGAGT_S19_L001"
##  [8] "GRHL1_KO_D02_GT23-10494_TCTCTACT-GAACCGCG_S26_L001" 
##  [9] "GRHL1_KO_G04_GT23-10495_CTCTCGTC-AGGTTATA_S15_L001" 
## [10] "GRHL1_KO_H04_GT23-10496_CCAAGTCT-TCATCCTT_S35_L001"
## starting manually correcting count data file column names
names(cts) = gsub("GHRL", "GRHL", names(cts))
## end the manual correction

stopifnot(setequal(names(cts)[-1], meta$file_id))

meta = meta[match(names(cts)[-1], meta$file_id),]
table(names(cts)[-1] == meta$file_id)
## 
## TRUE 
##   90
cts_dat = data.matrix(cts[,-1])
rownames(cts_dat) = cts$Name

Discard genes whose expression is 0 in more than 75% of samples

model_s = meta$model_system
table(model_s, useNA="ifany")
## model_s
## ExM PrS 
##  12  78
get_q75 <- function(x){quantile(x, probs = 0.75)}

gene_info = data.frame(Name = cts$Name, 
                       t(apply(cts_dat, 1, tapply, model_s, min)), 
                       t(apply(cts_dat, 1, tapply, model_s, median)), 
                       t(apply(cts_dat, 1, tapply, model_s, get_q75)), 
                       t(apply(cts_dat, 1, tapply, model_s, max)))

dim(gene_info)
## [1] 38592     9
gene_info[1:2,]
##                            Name ExM PrS ExM.1 PrS.1  ExM.2 PrS.2 ExM.3 PrS.3
## ENSG00000268674 ENSG00000268674   0   0   0.0   0.0   0.00     0     0     3
## ENSG00000271254 ENSG00000271254 634 387 812.5 775.5 848.25   923   948  1169
table(row.names(gene_info)==gene_info$Name, useNA="ifany")
## 
##  TRUE 
## 38592
names(gene_info)[2:9] = paste0(rep(c("ExM_", "PrS_"), times=4), 
                               rep(c("min", "median", "q75", "max"), each=2))
dim(gene_info)
## [1] 38592     9
gene_info[1:2,]
##                            Name ExM_min PrS_min ExM_median PrS_median ExM_q75
## ENSG00000268674 ENSG00000268674       0       0        0.0        0.0    0.00
## ENSG00000271254 ENSG00000271254     634     387      812.5      775.5  848.25
##                 PrS_q75 ExM_max PrS_max
## ENSG00000268674       0       0       3
## ENSG00000271254     923     948    1169
summary(gene_info[,-1])
##     ExM_min            PrS_min           ExM_median         PrS_median      
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0  
##  Median :     0.0   Median :     0.0   Median :     2.0   Median :     1.0  
##  Mean   :   376.3   Mean   :   255.1   Mean   :   530.5   Mean   :   585.1  
##  3rd Qu.:   120.0   3rd Qu.:    60.0   3rd Qu.:   188.5   3rd Qu.:   182.5  
##  Max.   :512761.0   Max.   :154723.0   Max.   :797550.5   Max.   :379480.0  
##     ExM_q75            PrS_q75            ExM_max            PrS_max      
##  Min.   :     0.0   Min.   :     0.0   Min.   :     0.0   Min.   :     0  
##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     1  
##  Median :     3.0   Median :     3.0   Median :     6.0   Median :    10  
##  Mean   :   608.8   Mean   :   712.7   Mean   :   751.6   Mean   :  1102  
##  3rd Qu.:   228.0   3rd Qu.:   233.8   3rd Qu.:   285.0   3rd Qu.:   405  
##  Max.   :886970.0   Max.   :456738.2   Max.   :928104.0   Max.   :801404
table(gene_info$ExM_q75 > 0, gene_info$PrS_q75 > 0)
##        
##         FALSE  TRUE
##   FALSE 12757   959
##   TRUE   2139 22737
w2kp = which(gene_info$ExM_q75 > 0 | gene_info$PrS_q75 > 0)
cts_dat = cts_dat[w2kp,]
dim(cts_dat)
## [1] 25835    90
gene_info = gene_info[w2kp,]

Read in gene annotation

gene_anno = fread("data/gencode_v44_primary_assembly_info.tsv")
dim(gene_anno)
## [1] 62754     8
gene_anno[1:2,]
##                geneId    chr strand     start       end ensembl_gene_id
##                <char> <char> <char>     <int>     <int>          <char>
## 1: ENSG00000000003.16   chrX      - 100627108 100639991 ENSG00000000003
## 2:  ENSG00000000005.6   chrX      + 100584936 100599885 ENSG00000000005
##    hgnc_symbol                                       description
##         <char>                                            <char>
## 1:      TSPAN6 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## 2:        TNMD   tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]
table(gene_info$Name %in% gene_anno$ensembl_gene_id)
## 
##  TRUE 
## 25835
# all ensembl gene IDs are contained in the annotation
setdiff(gene_info$Name, gene_anno$ensembl_gene_id)
## character(0)
gene_info = merge(gene_anno, gene_info, by.x="ensembl_gene_id", by.y = "Name", 
                  all.x = FALSE, all.y = TRUE)
dim(gene_info)
## [1] 25835    16
gene_info[1:2,]
## Key: <ensembl_gene_id>
##    ensembl_gene_id             geneId    chr strand     start       end
##             <char>             <char> <char> <char>     <int>     <int>
## 1: ENSG00000000003 ENSG00000000003.16   chrX      - 100627108 100639991
## 2: ENSG00000000419 ENSG00000000419.14  chr20      -  50934867  50959140
##    hgnc_symbol
##         <char>
## 1:      TSPAN6
## 2:        DPM1
##                                                                                       description
##                                                                                            <char>
## 1:                                              tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]
## 2: dolichyl-phosphate mannosyltransferase subunit 1, catalytic [Source:HGNC Symbol;Acc:HGNC:3005]
##    ExM_min PrS_min ExM_median PrS_median ExM_q75 PrS_q75 ExM_max PrS_max
##      <int>   <int>      <num>      <num>   <num>   <num>   <int>   <int>
## 1:    1181     739       1650     1190.0 1730.50 1347.00    1889    1733
## 2:     674     781        744     1216.5  817.25 1359.75    1033    1952
gene_info[gene_info$ExM_min > 2e4, c("ExM_min", "hgnc_symbol")]
##     ExM_min hgnc_symbol
##       <int>      <char>
##  1:   22327      PABPC1
##  2:   74260        ACTB
##  3:   20930    HSP90AA1
##  4:   28311        MMP2
##  5:   27746       TIMP3
##  6:   62150        MYH9
##  7:   22345    SERPINE1
##  8:   44101      COL1A1
##  9:   21494       HSPA8
## 10:   28495      EIF4G2
## 11:   21084       KRT18
## 12:   23312       GAPDH
## 13:   52349       SPARC
## 14:  512761         FN1
## 15:   21893       CALD1
## 16:   21476        LRP1
## 17:   67802       AHNAK
## 18:   21821       MACF1
## 19:   42812      COL5A1
## 20:   26322       MAP1B
## 21:   43164      COL4A2
## 22:   41582        FLNB
## 23:   35934        TPM1
## 24:   43713       ITGB1
## 25:   35160         DST
## 26:  115551      EEF1A1
## 27:   33223       ACTC1
## 28:   68632      COL1A2
## 29:   25034       YWHAZ
## 30:   26937        EEF2
## 31:   30376        KRT8
## 32:   21806        CALR
## 33:   21608       ANXA2
## 34:   45334       ACTG1
## 35:   56039      COL4A1
## 36:   42340        FLNA
## 37:   25425     DYNC1H1
## 38:   64575      MT-CO2
## 39:   59989      MT-CYB
## 40:   44890      MT-ND2
## 41:   62659      MT-ND5
## 42:  255443      MT-CO1
## 43:  147599      MT-ND4
## 44:   61144      MT-ND1
## 45:   58951     MT-ATP6
## 46:  120331      MT-CO3
## 47:   27357       NEAT1
## 48:  263389      MALAT1
## 49:  111980            
##     ExM_min hgnc_symbol
gene_info[gene_info$PrS_min > 2e4, c("PrS_min", "hgnc_symbol")]
##     PrS_min hgnc_symbol
##       <int>      <char>
##  1:   41173      PABPC1
##  2:   30591        ACTB
##  3:   25451      PAPOLA
##  4:   23823         SCD
##  5:   28878      EIF4G2
##  6:   37637       KRT18
##  7:   26632       GAPDH
##  8:   51812       AHNAK
##  9:   24707         H19
## 10:   92177      EEF1A1
## 11:   23950       YWHAZ
## 12:   21852     HSP90B1
## 13:   23790        EEF2
## 14:   38248        KRT8
## 15:   27307        CALR
## 16:   20647       ACTG1
## 17:   20508      MT-ND5
## 18:   75374      MT-CO1
## 19:   50209      MT-ND4
## 20:   27870      MT-CO3
## 21:  154723      MALAT1
## 22:   88631            
##     PrS_min hgnc_symbol
gene_info$gene_symbol = gene_info$hgnc_symbol

table(gene_info$gene_symbol == "")
## 
## FALSE  TRUE 
## 19957  5878
table(is.na(gene_info$gene_symbol))
## 
## FALSE 
## 25835
wEns = which(gene_info$gene_symbol == "" | is.na(gene_info$gene_symbol))
gene_info$gene_symbol[wEns] = gene_info$ensembl_gene_id[wEns]

table(gene_info$gene_symbol == "")
## 
## FALSE 
## 25835
table(is.na(gene_info$gene_symbol))
## 
## FALSE 
## 25835

Prepare for reading in DE results

Use each combination of (model system, condition) as DE group.

Run analysis for each DE group separately.

The specific DE groups can be found in the file with naming in the format:

${dataset_name}_DE_n_samples.tsv

There are also DE results between samples under normoxia and hypoxia conditions.

release_name = "2023_12_JAX_RNAseq_ExtraEmbryonic"
result_dir = file.path("results", release_name)
processed_output_dir = file.path(result_dir, "processed")

all_result_files = list.files(path=processed_output_dir, pattern=".tsv", 
                              all.files=TRUE, full.names=FALSE)
all_result_files = sort(all_result_files)

regular_DE_files = all_result_files[!grepl("hyp_vs_nor", all_result_files)]
condition_DE_files = all_result_files[grepl("hyp_vs_nor", all_result_files)]

extract_cores <- function(x){
  x_split = str_split(x, "_")[[1]]
  x_start = 6
  x_end = length(x_split)-1
  x_d_group = paste(x_split[x_start:(x_end-1)], collapse="_")
  x_gene = x_split[x_end]
  return(c(x_d_group, x_gene))
}

df_file_info = NULL

for (x in regular_DE_files){
  df_file_info = rbind(df_file_info, extract_cores(x))
}

df_file_info = as.data.frame(df_file_info)
colnames(df_file_info) = c("DE_general_group", "gene")
df_file_info
##   DE_general_group   gene
## 1    ExM_day_6_nor   ISL1
## 2    PrS_day_6_hyp  EPAS1
## 3    PrS_day_6_nor  EPAS1
## 4    PrS_day_6_nor   FOSB
## 5    PrS_day_6_nor   GCM1
## 6    PrS_day_6_nor  GRHL1
## 7    PrS_day_6_nor POU2F3
## 8    PrS_day_6_nor  PPARG
extract_cores_cond <- function(x){
  x_split = str_split(x, "_")[[1]]
  x_start = 6
  x_end = length(x_split)-4
  x_d_group = paste(x_split[x_start:(x_start+2)], collapse="_")
  x_gene = x_split[x_end-1]
  x_gene_strategy = paste(x_split[(x_end-1):x_end], collapse="_")
  return(c(x_d_group, x_gene, x_gene_strategy))
}

df_file_info_cond = NULL

for (x in condition_DE_files){
  df_file_info_cond = rbind(df_file_info_cond, extract_cores_cond(x))
}

df_file_info_cond = as.data.frame(df_file_info_cond)
colnames(df_file_info_cond) = c("DE_general_group", "gene", "gene_strategy")
df_file_info_cond
##   DE_general_group  gene gene_strategy
## 1        PrS_day_6 EPAS1      EPAS1_CE
## 2        PrS_day_6 EPAS1      EPAS1_KO
## 3        PrS_day_6 EPAS1     EPAS1_PTC
## 4        PrS_day_6    WT         WT_WT

Prepare output directories for saving figure files out

df_size = NULL

figure_dir = file.path("figures", release_name)
volcano_figure_dir = file.path(figure_dir, "volcano_plots")

if (!file.exists(volcano_figure_dir)){
  dir.create(volcano_figure_dir, recursive = TRUE)
}

Number of DE genes and properties of knockout gene

For DE between knockout and WT samples

Plot the knockout genes from different DE groups all together in one figure.

df_file_info$items = paste(df_file_info$DE_general_group, 
                           df_file_info$gene, sep="_")
df_file_info$items
## [1] "ExM_day_6_nor_ISL1"   "PrS_day_6_hyp_EPAS1"  "PrS_day_6_nor_EPAS1" 
## [4] "PrS_day_6_nor_FOSB"   "PrS_day_6_nor_GCM1"   "PrS_day_6_nor_GRHL1" 
## [7] "PrS_day_6_nor_POU2F3" "PrS_day_6_nor_PPARG"
table(table(df_file_info$items))
## 
## 1 
## 8
fc0 = log2(1.5)
p0  = 0.05

gene_symbols = df_file_info$gene
gene_symbols
## [1] "ISL1"   "EPAS1"  "EPAS1"  "FOSB"   "GCM1"   "GRHL1"  "POU2F3" "PPARG"
table(gene_symbols %in% gene_info$gene_symbol)
## 
## TRUE 
##    8
genes_w_multi_ensembl = names(which(table(gene_info$gene_symbol)>1))
genes_w_multi_ensembl
## [1] "GOLGA8M"      "LINC01238"    "TNFRSF10A-DT"
df = data.frame(KO = df_file_info$items, 
                group = df_file_info$DE_general_group,
                gene_symbol = df_file_info$gene, 
                CE_nDE = NA, KO_nDE = NA, PTC_nDE = NA, 
                CE_padj = NA, CE_log2FoldChange = NA, 
                KO_padj = NA, KO_log2FoldChange = NA, 
                PTC_padj = NA, PTC_log2FoldChange = NA)

df$Model = str_extract(df$KO, "^[a-zA-Z0-9]+")

gene_ids = gene_info$ensembl_gene_id[match(df_file_info$gene, gene_info$gene_symbol)]
gene_ids
## [1] "ENSG00000016082" "ENSG00000116016" "ENSG00000116016" "ENSG00000125740"
## [5] "ENSG00000137270" "ENSG00000134317" "ENSG00000137709" "ENSG00000132170"
for (k in 1:nrow(df_file_info)){
    
  res = fread(file.path(processed_output_dir,
                        paste0(release_name, "_", df_file_info$items[k], "_DEseq2.tsv")),
                        data.table = FALSE)
  print(res[1:2,])
  
  gene_id = gene_ids[k]
  w_gene = which(res$gene_ID == gene_id)
  stopifnot(length(w_gene) == 1)
  stopifnot(!(df$gene_symbol[k]%in%genes_w_multi_ensembl))
  
  for(ko1 in c("CE", "KO", "PTC")){
    
    fc = paste0(df$gene_symbol[k], "_", ko1, "_log2FoldChange")
    padj = paste0(df$gene_symbol[k], "_", ko1, "_padj")
    col_name = paste0(ko1, "_nDE")
    df[k,col_name] = sum(abs(res[[fc]]) > fc0 & res[[padj]] < p0, na.rm = TRUE)
    
    col_name = paste0(ko1, "_log2FoldChange")
    df[k,col_name] = res[[fc]][w_gene]
    
    col_name = paste0(ko1, "_padj")
    df[k,col_name] = res[[padj]][w_gene]
    
  }
  
}
##           gene_ID hgnc_symbol        chr strand  start    end
## 1 ENSG00000271254             KI270711.1      -   4612  29626
## 2 ENSG00000277196             KI270734.1      - 138082 161852
##   ISL1_CE_log2FoldChange ISL1_CE_pvalue ISL1_CE_padj ISL1_KO_log2FoldChange
## 1                  0.280         0.0102        0.104                  0.299
## 2                 -0.191         0.5370        0.749                 -0.660
##   ISL1_KO_pvalue ISL1_KO_padj ISL1_PTC_log2FoldChange ISL1_PTC_pvalue
## 1        0.00615       0.0989                   0.224          0.0412
## 2        0.03950       0.2780                  -0.160          0.6070
##   ISL1_PTC_padj
## 1         0.286
## 2         0.845
##           gene_ID hgnc_symbol        chr strand  start    end
## 1 ENSG00000271254             KI270711.1      -   4612  29626
## 2 ENSG00000277196             KI270734.1      - 138082 161852
##   EPAS1_CE_log2FoldChange EPAS1_CE_pvalue EPAS1_CE_padj EPAS1_KO_log2FoldChange
## 1                  -0.371         0.00184        0.0100                  -0.397
## 2                  -1.460         0.01890        0.0611                  -1.960
##   EPAS1_KO_pvalue EPAS1_KO_padj EPAS1_PTC_log2FoldChange EPAS1_PTC_pvalue
## 1        0.000912       0.00941                   -0.231          0.06600
## 2        0.002040       0.01790                   -1.720          0.00877
##   EPAS1_PTC_padj
## 1         0.2100
## 2         0.0499
##           gene_ID hgnc_symbol        chr strand  start    end
## 1 ENSG00000271254             KI270711.1      -   4612  29626
## 2 ENSG00000277196             KI270734.1      - 138082 161852
##   EPAS1_CE_log2FoldChange EPAS1_CE_pvalue EPAS1_CE_padj EPAS1_KO_log2FoldChange
## 1                   0.109           0.252         0.443                 0.00737
## 2                  -0.544           0.164         0.333                -0.39500
##   EPAS1_KO_pvalue EPAS1_KO_padj EPAS1_PTC_log2FoldChange EPAS1_PTC_pvalue
## 1           0.939         0.988                    0.114           0.2270
## 2           0.308            NA                   -0.759           0.0558
##   EPAS1_PTC_padj
## 1          0.547
## 2          0.294
##           gene_ID hgnc_symbol        chr strand  start    end
## 1 ENSG00000271254             KI270711.1      -   4612  29626
## 2 ENSG00000277196             KI270734.1      - 138082 161852
##   FOSB_CE_log2FoldChange FOSB_CE_pvalue FOSB_CE_padj FOSB_KO_log2FoldChange
## 1                 -0.156          0.107         0.34                 -0.231
## 2                  0.251          0.517         0.74                  0.361
##   FOSB_KO_pvalue FOSB_KO_padj FOSB_PTC_log2FoldChange FOSB_PTC_pvalue
## 1         0.0171        0.129                 -0.0168           0.860
## 2         0.3480        0.650                  0.2970           0.437
##   FOSB_PTC_padj
## 1         0.984
## 2         0.903
##           gene_ID hgnc_symbol        chr strand  start    end
## 1 ENSG00000271254             KI270711.1      -   4612  29626
## 2 ENSG00000277196             KI270734.1      - 138082 161852
##   GCM1_CE_log2FoldChange GCM1_CE_pvalue GCM1_CE_padj GCM1_KO_log2FoldChange
## 1                  0.160          0.100        0.187                 0.0458
## 2                 -0.372          0.367        0.507                -0.3690
##   GCM1_KO_pvalue GCM1_KO_padj GCM1_PTC_log2FoldChange GCM1_PTC_pvalue
## 1          0.649        0.773                  0.0469           0.637
## 2          0.383        0.546                 -0.8320           0.055
##   GCM1_PTC_padj
## 1          0.77
## 2          0.13
##           gene_ID hgnc_symbol        chr strand  start    end
## 1 ENSG00000271254             KI270711.1      -   4612  29626
## 2 ENSG00000277196             KI270734.1      - 138082 161852
##   GRHL1_CE_log2FoldChange GRHL1_CE_pvalue GRHL1_CE_padj GRHL1_KO_log2FoldChange
## 1                  -0.706        3.08e-11      2.79e-09                  -0.805
## 2                   1.020        1.30e-02      4.51e-02                   1.200
##   GRHL1_KO_pvalue GRHL1_KO_padj GRHL1_PTC_log2FoldChange GRHL1_PTC_pvalue
## 1        1.00e-15      2.51e-13                   -0.879         5.98e-18
## 2        1.63e-03      1.15e-02                    1.070         5.70e-03
##   GRHL1_PTC_padj
## 1       1.15e-15
## 2       1.75e-02
##           gene_ID hgnc_symbol        chr strand  start    end
## 1 ENSG00000271254             KI270711.1      -   4612  29626
## 2 ENSG00000277196             KI270734.1      - 138082 161852
##   POU2F3_CE_log2FoldChange POU2F3_CE_pvalue POU2F3_CE_padj
## 1                   -0.244           0.0144         0.0578
## 2                    0.229           0.5650         0.7050
##   POU2F3_KO_log2FoldChange POU2F3_KO_pvalue POU2F3_KO_padj
## 1                   -0.162            0.097          0.228
## 2                    0.060            0.879          0.928
##   POU2F3_PTC_log2FoldChange POU2F3_PTC_pvalue POU2F3_PTC_padj
## 1                    -0.163            0.0855           0.366
## 2                     0.346            0.3590           0.669
##           gene_ID hgnc_symbol        chr strand  start    end
## 1 ENSG00000271254             KI270711.1      -   4612  29626
## 2 ENSG00000277196             KI270734.1      - 138082 161852
##   PPARG_CE_log2FoldChange PPARG_CE_pvalue PPARG_CE_padj PPARG_KO_log2FoldChange
## 1                  0.0708           0.455         0.588                   0.063
## 2                  0.2240           0.496         0.624                   0.752
##   PPARG_KO_pvalue PPARG_KO_padj PPARG_PTC_log2FoldChange PPARG_PTC_pvalue
## 1           0.507        0.6680                  -0.0541            0.568
## 2           0.023        0.0707                   0.4520            0.167
##   PPARG_PTC_padj
## 1          0.700
## 2          0.291
print(df)
##                     KO         group gene_symbol CE_nDE KO_nDE PTC_nDE  CE_padj
## 1   ExM_day_6_nor_ISL1 ExM_day_6_nor        ISL1    652    405     432 1.05e-10
## 2  PrS_day_6_hyp_EPAS1 PrS_day_6_hyp       EPAS1   4196   2140    2553 1.53e-20
## 3  PrS_day_6_nor_EPAS1 PrS_day_6_nor       EPAS1   1372     40     304 2.57e-16
## 4   PrS_day_6_nor_FOSB PrS_day_6_nor        FOSB    409    847     121 1.75e-01
## 5   PrS_day_6_nor_GCM1 PrS_day_6_nor        GCM1   5299   4486    4255 2.02e-04
## 6  PrS_day_6_nor_GRHL1 PrS_day_6_nor       GRHL1   3439   2483    5095 1.35e-15
## 7 PrS_day_6_nor_POU2F3 PrS_day_6_nor      POU2F3   2038    590       6 1.71e-07
## 8  PrS_day_6_nor_PPARG PrS_day_6_nor       PPARG   4503   3029    4014 8.64e-01
##   CE_log2FoldChange  KO_padj KO_log2FoldChange PTC_padj PTC_log2FoldChange
## 1           -1.6300 2.88e-07            -1.350 5.20e-14             -1.870
## 2           -2.5400 7.20e-58            -4.310 1.29e-11             -2.010
## 3           -2.2800 1.27e-65            -4.420 2.14e-10             -1.870
## 4           -0.6810 1.90e-18            -2.970 4.95e-01              0.592
## 5            0.6700 3.65e-01             0.208 1.07e-14             -1.360
## 6           -1.7700 2.82e-56            -3.130 7.09e-25             -2.100
## 7           -1.9700 9.64e-51            -5.960 5.02e-02             -1.110
## 8            0.0235 7.92e-09             0.571 1.98e-02             -0.251
##   Model
## 1   ExM
## 2   PrS
## 3   PrS
## 4   PrS
## 5   PrS
## 6   PrS
## 7   PrS
## 8   PrS
p1 <- ggplot(df, aes(x=(PTC_nDE), y=(CE_nDE), 
               label=gene_symbol, color=group)) + 
      geom_point() + scale_color_brewer(palette = "Dark2")+
      geom_text_repel(point.padding = 0.5, min.segment.length = 0, 
                      size = 3,  # Adjust text size
                      box.padding = 0.5, 
                      max.overlaps = 20, show.legend = FALSE) + 
      ggtitle("# of DE genes") + 
      labs(x="PTC", y="CE") + 
      geom_abline(intercept = 0, slope = 1, color = "orange", linetype = "dashed")

p2 <- ggplot(df, aes(x=(PTC_nDE), y=(KO_nDE), 
               label=gene_symbol, color=group)) + 
      geom_point() + scale_color_brewer(palette = "Dark2")+
      geom_text_repel(point.padding = 0.5, min.segment.length = 0, 
                      size = 3,  # Adjust text size
                      box.padding = 0.5, 
                      max.overlaps = 20, show.legend = FALSE) + 
      ggtitle("# of DE genes") + 
      labs(x="PTC", y="KO") + 
      geom_abline(intercept = 0, slope = 1, color = "orange", linetype = "dashed")

p3 <- ggplot(df, aes(x=CE_log2FoldChange, y=-log10(CE_padj), 
               label=gene_symbol, color=group)) + 
      geom_point() + scale_color_brewer(palette = "Dark2")+
      geom_text_repel(point.padding = 0.5, min.segment.length = 0, 
                      size = 3,  # Adjust text size
                      box.padding = 0.5, 
                      max.overlaps = 20, show.legend = FALSE) + 
      ggtitle("DE results of the KO genes, CE") + 
      labs(x="log2 fold change", y="-log10(adjusted p-value)")  + 
      geom_hline(yintercept = 2, color = "grey", linetype = "dashed") + 
      geom_vline(xintercept = c(log2(1/1.5), log2(1.5)), color = "grey",
                 linetype = "dashed")
  
p4 <- ggplot(df, aes(x=KO_log2FoldChange, y=-log10(KO_padj), 
               label=gene_symbol, color=group)) + 
      geom_point() + scale_color_brewer(palette = "Dark2")+
      geom_text_repel(point.padding = 0.5, min.segment.length = 0, 
                      size = 3,  # Adjust text size
                      box.padding = 0.5, 
                      max.overlaps = 20, show.legend = FALSE) + 
      ggtitle("DE results of the KO genes, KO") + 
      labs(x="log2 fold change", y="-log10(adjusted p-value)")  + 
      geom_hline(yintercept = 2, color = "grey", linetype = "dashed") + 
      geom_vline(xintercept = c(log2(1/1.5), log2(1.5)), color = "grey",
                 linetype = "dashed")

  
p5 <- ggplot(df, aes(x=PTC_log2FoldChange, y=-log10(PTC_padj), 
               label=gene_symbol, color=group)) + 
      geom_point() + scale_color_brewer(palette = "Dark2")+
      geom_text_repel(point.padding = 0.5, min.segment.length = 0, 
                      size = 3,  # Adjust text size
                      box.padding = 0.5, 
                      max.overlaps = 20, show.legend = FALSE) + 
      ggtitle("DE results of the KO genes, PTC") + 
      labs(x="log2 fold change", y="-log10(adjusted p-value)")  + 
      geom_hline(yintercept = 2, color = "grey", linetype = "dashed") + 
      geom_vline(xintercept = c(log2(1/1.5), log2(1.5)), color = "grey",
                 linetype = "dashed")

g_combined <- ggarrange(p1, p2, p3, p4, p5, ncol = 2, nrow = 3)
print(g_combined)

For DE between samples under different conditions

Barchart for number of DE genes

df_file_info_cond$items = paste(df_file_info_cond$DE_general_group, 
                                df_file_info_cond$gene_strategy, sep="_")
df_file_info_cond$items
## [1] "PrS_day_6_EPAS1_CE"  "PrS_day_6_EPAS1_KO"  "PrS_day_6_EPAS1_PTC"
## [4] "PrS_day_6_WT_WT"
fc0 = log2(1.5)
p0  = 0.05

df_cond = data.frame(gene_strategy = df_file_info_cond$gene_strategy, 
                     nDE = NA)

for (k in 1:nrow(df_file_info_cond)){
    
  res = fread(file.path(processed_output_dir,
                        paste0(release_name, "_", 
                               df_file_info_cond$items[k], 
                               "_hyp_vs_nor_DEseq2.tsv")),
                        data.table = FALSE)
  print(res[1:2,])
  fc = "hyp_vs_nor_log2FoldChange"
  padj = "hyp_vs_nor_padj"
  df_cond[k,"nDE"] = sum(abs(res[[fc]]) > fc0 & res[[padj]] < p0, na.rm = TRUE)
  
}
##           gene_ID hgnc_symbol        chr strand  start    end
## 1 ENSG00000271254             KI270711.1      -   4612  29626
## 2 ENSG00000277196             KI270734.1      - 138082 161852
##   hyp_vs_nor_log2FoldChange hyp_vs_nor_pvalue hyp_vs_nor_padj
## 1                    -0.584          0.000173        0.000885
## 2                     0.353          0.618000        0.751000
##           gene_ID hgnc_symbol        chr strand  start    end
## 1 ENSG00000271254             KI270711.1      -   4612  29626
## 2 ENSG00000277196             KI270734.1      - 138082 161852
##   hyp_vs_nor_log2FoldChange hyp_vs_nor_pvalue hyp_vs_nor_padj
## 1                    -0.447            0.0276           0.106
## 2                    -0.355            0.5510           0.732
##           gene_ID hgnc_symbol        chr strand  start    end
## 1 ENSG00000271254             KI270711.1      -   4612  29626
## 2 ENSG00000277196             KI270734.1      - 138082 161852
##   hyp_vs_nor_log2FoldChange hyp_vs_nor_pvalue hyp_vs_nor_padj
## 1                    -0.476           0.00143         0.00652
## 2                     0.506           0.38100         0.53700
##           gene_ID hgnc_symbol        chr strand  start    end
## 1 ENSG00000271254             KI270711.1      -   4612  29626
## 2 ENSG00000277196             KI270734.1      - 138082 161852
##   hyp_vs_nor_log2FoldChange hyp_vs_nor_pvalue hyp_vs_nor_padj
## 1                   -0.0979            0.4930          0.6120
## 2                    1.1500            0.0105          0.0275
df_cond
##   gene_strategy  nDE
## 1      EPAS1_CE 3518
## 2      EPAS1_KO 2518
## 3     EPAS1_PTC 3664
## 4         WT_WT 6724
p1 <- ggplot(df_cond, aes(x=gene_strategy, y=nDE, fill=gene_strategy)) +
      geom_bar(stat="identity")+theme_classic() + 
      ylab("Number of DE genes")+
      ggtitle("DE hyp vs nor") + 
      theme(axis.title.x=element_blank(),
            axis.text.x=element_blank(),
            axis.ticks.x=element_blank())

print(p1)

Generate volcano plots

For DE between knockout and WT samples

df_file_info
##   DE_general_group   gene                items
## 1    ExM_day_6_nor   ISL1   ExM_day_6_nor_ISL1
## 2    PrS_day_6_hyp  EPAS1  PrS_day_6_hyp_EPAS1
## 3    PrS_day_6_nor  EPAS1  PrS_day_6_nor_EPAS1
## 4    PrS_day_6_nor   FOSB   PrS_day_6_nor_FOSB
## 5    PrS_day_6_nor   GCM1   PrS_day_6_nor_GCM1
## 6    PrS_day_6_nor  GRHL1  PrS_day_6_nor_GRHL1
## 7    PrS_day_6_nor POU2F3 PrS_day_6_nor_POU2F3
## 8    PrS_day_6_nor  PPARG  PrS_day_6_nor_PPARG
fc0
## [1] 0.5849625
p0
## [1] 0.05
for(k in 1:nrow(df_file_info)){
  
  it_k = df_file_info$items[k]
  d_group = df_file_info$DE_general_group[k]
  gene_name = df_file_info$gene[k]
  
  print(it_k)
  
  res = fread(file.path(processed_output_dir,
                        paste0(release_name, "_", it_k, "_DEseq2.tsv")),
              data.table = FALSE)

  for (ko1 in c("CE", "KO", "PTC")){

    res_k = res[,c(1, grep(ko1, names(res)))]
    names(res_k) = gsub(paste0(gene_name, "_", ko1, "_"), "", names(res_k))

    ww_up = which((res_k$log2FoldChange > fc0) & (res_k$padj < p0))
    ww_down = which((res_k$log2FoldChange < ((-1)*fc0)) & (res_k$padj < p0))
    
    res_k$DE = "No"
    res_k$DE[ww_up] <- "Up"
    res_k$DE[ww_down] <- "Down"
    
    res_k$log2FoldChange[which(res_k$log2FoldChange > 3)] = 3
    res_k$log2FoldChange[which(res_k$log2FoldChange < -3)] = -3
  
    col2use = c("blue", "grey", "red")
    names(col2use) = c("Down", "No", "Up")
    
    res_k$delabel = NA
    res_k$gene_symbol = gene_info$gene_symbol[match(res_k$gene_ID, 
                                                    gene_info$ensembl_gene_id)]
    w_de = which(res_k$DE != "No")
    res_k$delabel[w_de] = res_k$gene_symbol[w_de]
  
    print(table(res_k$DE))
    
    if (gene_name=="EPAS1"){
      d_group_in_title = d_group
    }else{
      d_group_in_title = gsub("_nor", "", d_group)
    }
    
    g2 = ggplot(res_k, aes(x=log2FoldChange, y=-log10(padj), 
                      col=DE, label=delabel)) + 
           geom_point() + ggtitle(paste0(d_group_in_title, "\n", gene_name, "_", ko1)) + 
           geom_text_repel(point.padding = 0.5,
                           min.segment.length = 0, 
                           size = 3,  # Adjust text size
                           box.padding = 0.5,
                           max.overlaps = 20, 
                           show.legend = FALSE) + 
           scale_color_manual(values=col2use)
      
    print(g2)

  }
  
  saved_figure_name = paste0(d_group_in_title, "_", gene_name, "_", ko1, ".svg")
  ggsave(file=file.path(volcano_figure_dir, saved_figure_name), 
         plot=g2, width=4, height=3, units="in")
}
## [1] "ExM_day_6_nor_ISL1"
## 
##  Down    No    Up 
##   312 23190   340
## Warning: Removed 6968 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23190 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 640 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 
##  Down    No    Up 
##   240 23437   165
## Warning: Removed 8340 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23437 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 396 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 
##  Down    No    Up 
##   251 23410   181
## Warning: Removed 7420 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23410 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 423 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 7420 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23410 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 423 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## [1] "PrS_day_6_hyp_EPAS1"
## 
##  Down    No    Up 
##  1930 19130  2266
## Warning: Removed 4990 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 19130 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 4195 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 
##  Down    No    Up 
##   834 21186  1306
## Warning: Removed 5434 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 21186 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 2137 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 
##  Down    No    Up 
##  1006 20773  1547
## Warning: Removed 5433 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 20773 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 2550 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 5433 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 20773 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 2550 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## [1] "PrS_day_6_nor_EPAS1"
## 
##  Down    No    Up 
##   603 22121   769
## Warning: Removed 7321 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22121 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 1364 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 
##  Down    No    Up 
##    22 23453    18
## Warning: Removed 11844 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23453 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 36 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 
##  Down    No    Up 
##   161 23189   143
## Warning: Removed 8668 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23189 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 294 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 8668 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23189 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 294 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## [1] "PrS_day_6_nor_FOSB"
## 
##  Down    No    Up 
##    48 23084   361
## Warning: Removed 6866 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23084 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 401 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 
##  Down    No    Up 
##   256 22646   591
## Warning: Removed 5991 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22646 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 832 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 
##  Down    No    Up 
##    34 23372    87
## Warning: Removed 6865 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23372 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 108 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 6865 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23372 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 108 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## [1] "PrS_day_6_nor_GCM1"
## 
##  Down    No    Up 
##  3036 18194  2263
## Warning: Removed 4570 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18194 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 5299 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 
##  Down    No    Up 
##  2704 19007  1782
## Warning: Removed 4564 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 19007 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 4486 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 
##  Down    No    Up 
##  2599 19238  1656
## Warning: Removed 4514 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 19238 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 4255 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 4514 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 19238 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 4255 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## [1] "PrS_day_6_nor_GRHL1"
## 
##  Down    No    Up 
##  2055 20054  1384
## Warning: Removed 4556 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 20054 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 3437 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 
##  Down    No    Up 
##  1449 21010  1034
## Warning: Removed 4619 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 21010 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 2482 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 
##  Down    No    Up 
##  2740 18398  2355
## Warning: Removed 4504 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18398 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 5092 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 4504 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18398 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 5092 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## [1] "PrS_day_6_nor_POU2F3"
## 
##  Down    No    Up 
##  1098 21455   940
## Warning: Removed 6005 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 21455 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 2036 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 
##  Down    No    Up 
##   225 22903   365
## Warning: Removed 8668 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22903 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 589 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 
##  Down    No    Up 
##     4 23487     2
## Warning: Removed 4505 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23487 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: Removed 4505 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 23487 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).

## [1] "PrS_day_6_nor_PPARG"
## 
##  Down    No    Up 
##  2296 18990  2207
## Warning: Removed 4429 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18990 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 4502 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 
##  Down    No    Up 
##  1634 20464  1395
## Warning: Removed 4613 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 20464 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 3027 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## 
##  Down    No    Up 
##  2342 19479  1672
## Warning: Removed 4460 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 19479 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 4013 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 4460 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 19479 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 4013 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

For DE between samples under different conditions

df_file_info_cond
##   DE_general_group  gene gene_strategy               items
## 1        PrS_day_6 EPAS1      EPAS1_CE  PrS_day_6_EPAS1_CE
## 2        PrS_day_6 EPAS1      EPAS1_KO  PrS_day_6_EPAS1_KO
## 3        PrS_day_6 EPAS1     EPAS1_PTC PrS_day_6_EPAS1_PTC
## 4        PrS_day_6    WT         WT_WT     PrS_day_6_WT_WT
fc0
## [1] 0.5849625
p0
## [1] 0.05
for(k in 1:nrow(df_file_info_cond)){
  
  it_k = df_file_info_cond$items[k]
  d_group = df_file_info_cond$DE_general_group[k]
  gene_strategy_name = df_file_info_cond$gene_strategy[k]
  
  print(it_k)
  
  res = fread(file.path(processed_output_dir,
                        paste0(release_name, "_", 
                               df_file_info_cond$items[k], 
                               "_hyp_vs_nor_DEseq2.tsv")),
                        data.table = FALSE)  

  res_k = res[,c(1, grep("hyp_vs_nor", names(res)))]
  names(res_k) = gsub("hyp_vs_nor_", "", names(res_k))

  ww_up = which((res_k$log2FoldChange > fc0) & (res_k$padj < p0))
  ww_down = which((res_k$log2FoldChange < ((-1)*fc0)) & (res_k$padj < p0))
    
  res_k$DE = "No"
  res_k$DE[ww_up] <- "Up"
  res_k$DE[ww_down] <- "Down"
    
  res_k$log2FoldChange[which(res_k$log2FoldChange > 3)] = 3
  res_k$log2FoldChange[which(res_k$log2FoldChange < -3)] = -3

  col2use = c("blue", "grey", "red")
  names(col2use) = c("Down", "No", "Up")
    
  res_k$delabel = NA
  res_k$gene_symbol = gene_info$gene_symbol[match(res_k$gene_ID, 
                                                  gene_info$ensembl_gene_id)]
  w_de = which(res_k$DE != "No")
  res_k$delabel[w_de] = res_k$gene_symbol[w_de]

  print(table(res_k$DE))
    
  g2 = ggplot(res_k, aes(x=log2FoldChange, y=-log10(padj), 
                    col=DE, label=delabel)) + 
         geom_point() + ggtitle(paste0(d_group, " ", gene_strategy_name, 
                                       "\nhyp v.s. nor")) + 
         geom_text_repel(point.padding = 0.5,
                         min.segment.length = 0, 
                         size = 3,  # Adjust text size
                         box.padding = 0.5,
                         max.overlaps = 20, 
                         show.legend = FALSE) + 
         scale_color_manual(values=col2use)
    
  print(g2)
  
  saved_figure_name = paste0(d_group, "_", gene_strategy_name, 
                            "_hyp_vs_nor.svg")
  ggsave(file=file.path(volcano_figure_dir, saved_figure_name), 
         plot=g2, width=4, height=3, units="in")
}
## [1] "PrS_day_6_EPAS1_CE"
## 
##  Down    No    Up 
##  1624 17419  1894
## Warning: Removed 5277 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 17419 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 3505 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 5277 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 17419 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 3518 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## [1] "PrS_day_6_EPAS1_KO"
## 
##  Down    No    Up 
##  1302 18659  1216
## Warning: Removed 6159 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18659 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 2516 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 6159 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18659 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 2516 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## [1] "PrS_day_6_EPAS1_PTC"
## 
##  Down    No    Up 
##  1809 17367  1855
## Warning: Removed 4485 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 17367 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 3659 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 4485 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 17367 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 3662 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## [1] "PrS_day_6_WT_WT"
## 
##  Down    No    Up 
##  3400 14497  3324
## Warning: Removed 2059 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 14497 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 6722 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 2059 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 14497 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 6724 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  7897315 421.8   12621380 674.1         NA 12621380 674.1
## Vcells 19392366 148.0   46420136 354.2      65536 46410901 354.1
sessionInfo()
## R version 4.2.3 (2023-03-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] svglite_2.1.3               readxl_1.4.3               
##  [3] UpSetR_1.4.0                ComplexHeatmap_2.14.0      
##  [5] data.table_1.15.4           dplyr_1.1.4                
##  [7] ggpointdensity_0.1.0        viridis_0.6.5              
##  [9] viridisLite_0.4.2           DESeq2_1.38.3              
## [11] SummarizedExperiment_1.28.0 Biobase_2.58.0             
## [13] MatrixGenerics_1.10.0       matrixStats_1.3.0          
## [15] GenomicRanges_1.50.2        GenomeInfoDb_1.34.9        
## [17] IRanges_2.32.0              S4Vectors_0.36.2           
## [19] BiocGenerics_0.44.0         RColorBrewer_1.1-3         
## [21] MASS_7.3-60                 stringr_1.5.1              
## [23] ggpubr_0.6.0                ggrepel_0.9.5              
## [25] ggplot2_3.5.1              
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.1-1       ggsignif_0.6.4         rjson_0.2.21          
##  [4] circlize_0.4.16        XVector_0.38.0         GlobalOptions_0.1.2   
##  [7] clue_0.3-65            rstudioapi_0.16.0      farver_2.1.2          
## [10] bit64_4.0.5            AnnotationDbi_1.60.2   fansi_1.0.6           
## [13] codetools_0.2-20       doParallel_1.0.17      cachem_1.1.0          
## [16] geneplotter_1.76.0     knitr_1.48             jsonlite_1.8.8        
## [19] broom_1.0.6            annotate_1.76.0        cluster_2.1.6         
## [22] png_0.1-8              compiler_4.2.3         httr_1.4.7            
## [25] backports_1.5.0        Matrix_1.5-4.1         fastmap_1.2.0         
## [28] cli_3.6.3              htmltools_0.5.8.1      tools_4.2.3           
## [31] gtable_0.3.5           glue_1.7.0             GenomeInfoDbData_1.2.9
## [34] Rcpp_1.0.13            carData_3.0-5          cellranger_1.1.0      
## [37] jquerylib_0.1.4        vctrs_0.6.5            Biostrings_2.66.0     
## [40] iterators_1.0.14       xfun_0.47              lifecycle_1.0.4       
## [43] rstatix_0.7.2          XML_3.99-0.17          zlibbioc_1.44.0       
## [46] scales_1.3.0           ragg_1.2.7             parallel_4.2.3        
## [49] yaml_2.3.10            memoise_2.0.1          gridExtra_2.3         
## [52] sass_0.4.9             stringi_1.8.4          RSQLite_2.3.7         
## [55] highr_0.11             foreach_1.5.2          BiocParallel_1.32.6   
## [58] shape_1.4.6.1          rlang_1.1.4            pkgconfig_2.0.3       
## [61] systemfonts_1.1.0      bitops_1.0-8           evaluate_0.24.0       
## [64] lattice_0.22-6         purrr_1.0.2            labeling_0.4.3        
## [67] cowplot_1.1.3          bit_4.0.5              tidyselect_1.2.1      
## [70] plyr_1.8.9             magrittr_2.0.3         R6_2.5.1              
## [73] generics_0.1.3         DelayedArray_0.24.0    DBI_1.2.3             
## [76] pillar_1.9.0           withr_3.0.1            KEGGREST_1.38.0       
## [79] abind_1.4-5            RCurl_1.98-1.16        tibble_3.2.1          
## [82] crayon_1.5.3           car_3.1-2              utf8_1.2.4            
## [85] rmarkdown_2.28         GetoptLong_1.0.5       locfit_1.5-9.10       
## [88] blob_1.2.4             digest_0.6.37          xtable_1.8-4          
## [91] tidyr_1.3.1            textshaping_0.3.7      munsell_0.5.1         
## [94] bslib_0.8.0